#!/usr/bin/perl
#
# $OpenBSD: mdoclint,v 1.44 2014/06/24 12:41:19 jmc Exp $
# $NetBSD: mdoclint,v 1.49 2014/06/23 18:10:21 wiz Exp $
#
# Copyright (c) 2001-2013 Thomas Klausner
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR, THOMAS KLAUSNER,
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#

use strict;
use warnings;

$| = 1;

package Parser;
use Getopt::Std;

use constant {
	OPENBSD => 1,
	NETBSD => 0,
	SECTION_SEE_ALSO => 2,
	SECTION_AUTHORS => 3
};

use vars qw(
	$opt_A $opt_a $opt_D $opt_d $opt_e $opt_F $opt_f $opt_H $opt_h $opt_l
	$opt_m
	$opt_n $opt_o $opt_P $opt_p $opt_r $opt_S $opt_s $opt_v $opt_w
	$opt_X $opt_x
);


my $arch=`uname -m`;
chomp($arch);
my $options="AaDdeFfHhlmnoPprSsvwXx";

sub usage
	{
	my $default = OPENBSD ? "-AaDdfmnoPprSsXx" : "-AaDdeflmnoPprSsXx";

	print STDERR <<"EOF";
mdoclint: verify man page correctness
usage:	mdoclint [-$options] file ...
	-A	warn about missing .An in AUTHORS section
	-a	warn about SEE ALSO section problems
	-D	warn about bad casing and archs in .Dt
	-d	warn about bad date strings (in .Dd only)
	-e	warn about unsorted errors (for functions)
	-F	fix whitespace problems (asks before overwriting)
	-f	warn about possible incorrect .Fn syntax
	-H	warn about characters that produce problems in HTML output
	-h	display this help text
	-l	warn about unknown libraries
	-m	warn about man pages that are not in mdoc(7) format
	-n	warn about .Nd's ending in '.'
	-o	warn about non-empty .Os strings
	-P	warn about paragraph problems
	-p	warn about punctuation problems
	-r	warn about missing RCS Id
	-S	warn about any .Sh weirdness
	-s	warn about whitespace problems
	-v	verbose output
	-w	show section header in warnings
	-X	warn about explicit mentions of the *BSD system names
	-x	warn about cross-references with missing targets
Default is $default if no flag is specified.
EOF
	exit(0);
}


my %short = (
    "Free" => ".Fx",
    "Net" => ".Nx",
    "DragonFly" => ".Dx",
    "Open" => ".Ox"
);

my %libraries = (
    "libarchive" => 1,
    "libarm" => 1,
    "libarm32" => 1,
    "libbluetooth" => 1,
    "libc" => 1,
    "libcdk" => 1,
    "libcompat" => 1,
    "libcrypt" => 1,
    "libcurses" => 1,
    "libdm" => 1,
    "libedit" => 1,
    "libelf" => 1,
    "libevent" => 1,
    "libexecinfo" => 1,
    "libfetch" => 1,
    "libform" => 1,
    "libi386" => 1,
    "libintl" => 1,
    "libipsec" => 1,
    "libiscsi" => 1,
    "libisns" => 1,
    "libkvm" => 1,
    "libm" => 1,
    "libm68k" => 1,
    "libmagic" => 1,
    "libmandoc" => 1,
    "libmenu" => 1,
    "libmj" => 1,
    "libnetpgp" => 1,
    "libnetpgpverify" => 1,
    "libnpf" => 1,
    "libossaudio" => 1,
    "libpam" => 1,
    "libpcap" => 1,
    "libpci" => 1,
    "libperfuse" => 1,
    "libpmc" => 1,
    "libposix" => 1,
    "libppath" => 1,
    "libprop" => 1,
    "libpthread" => 1,
    "libpuffs" => 1,
    "libquota" => 1,
    "librefuse" => 1,
    "libresolv" => 1,
    "librt" => 1,
    "librumpclient" => 1,
    "libsaslc" => 1,
    "libssp" => 1,
    "libtermcap" => 1,
    "libterminfo" => 1,
    "libusbhid" => 1,
    "libutil" => 1,
    "libx86_64" => 1,
    "libz" => 1
);

# constants to build
my %sections;
my $arches_re;
my $sections_re;
my $esections_re;
my $valid_date_re;
# and the code that builds them
{
	my @sections = (
		"NAME",
		NETBSD ? "LIBRARY" : undef,
		"SYNOPSIS",
		"DESCRIPTION",
		OPENBSD ? "CONTEXT" : undef ,
		NETBSD ? "IMPLEMENTATION NOTES" : undef,
		"RETURN VALUES",
		"ENVIRONMENT",
		"FILES",
		"EXIT STATUS",
		"EXAMPLES",
		"DIAGNOSTICS",
		NETBSD ? "COMPATIBILITY" : undef,
		"ERRORS",
		NETBSD ? "CODE REFERENCES" : undef,
		"SEE ALSO",
		"STANDARDS",
		"HISTORY",
		"AUTHORS",
		"CAVEATS",
		"BUGS",
		NETBSD ? "SECURITY CONSIDERATIONS" : undef
	);

	my $i = 1;
	for my $sh (@sections) {
		if (defined $sh) {
			$sections{$sh} = $i++;
		}
	}
	my @arches;
	if (OPENBSD) {
		@arches =
		    (qw(alpha amd64 arm armish armv7 aviion cats hppa
		    hppa64 i386 landisk loongson luna88k macppc mips64
		    octeon sgi socppc sparc sparc64 vax
		    zaurus));
	}
	if (NETBSD) {
		@arches =
		    (qw(acorn26 acorn32 algor alpha amiga arc atari
		    bebox cats cesfic cobalt dreamcast evbarm evbmips evbppc
		    evbsh3 evbsh5 hp300 hpcarm hpcmips hpcsh hppa
		    i386 ibmnws luna68k mac68k macppc mipsco mmeye
		    mvme68k mvmeppc netwinder news68k newsmips next68k
		    pc532 playstation2 pmax pmppc prep sandpoint sbmips
		    sgimips shark sparc sparc64 sun2 sun3 vax walnut
		    x68k x86 x86_64 xen));
	}
	my $a = join('|', @arches);
	$arches_re = qr{(?:$a)}o;
	if (OPENBSD) {
		$sections_re = qr{(?:3p|[1-9])}o;
		$esections_re = qr{(?:3p|[0-9])}o;
	}
	if (NETBSD) {
		$sections_re = qr{[1-9](?:lua)?}o;
		$esections_re = qr{[0-9](?:lua)?}o;
	}
	if (OPENBSD) {
		$valid_date_re = qr{\$Mdocdate\b};
	}
	if (NETBSD) {
		$valid_date_re = qr{(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*[1-9][0-9]*,\s*(?:198[0-9]|199[0-9]|200[0-9]|201[0-4])$}o;
	}
}

sub debug
{
	my $self = shift;
    	print STDOUT "debug: $self->{fn}:$self->{ln}: @_\n" if $opt_v;
}

sub warning
{
	my $self = shift;
	my $extra = "";
	if ($opt_w) {
		$extra = $self->{current_section_header}.":";
	}
	print STDOUT "$self->{fn}:$extra$self->{ln}: ", join('', @_), "\n";
}

sub handle_options
{
	getopts($options);
	$opt_h and usage();

	# default to all warnings if no flag is set
	unless ($opt_A or $opt_a or $opt_D or $opt_d or $opt_e
		or $opt_f or $opt_H or $opt_l
		or $opt_m or $opt_n
		or $opt_o or $opt_P or $opt_p or $opt_r
		or $opt_S or $opt_s or $opt_X or $opt_x) {
		$opt_A = $opt_a = $opt_D = $opt_d = $opt_f = $opt_m =
		    $opt_n = $opt_o = $opt_P = $opt_p = $opt_r = $opt_S =
		    $opt_s = $opt_X = $opt_x = 1;
		$opt_e = $opt_l = 1 if NETBSD;
	}
}


sub verify_xref
{
	my ($self, $page, $section, $pre, $post) = @_;
	if ("$page.$section" eq $self->{fn}) {
		$self->warning("Xref to itself (use .Nm instead)") if $opt_x;
	}
	# try to find corresponding man page
	for my $dir ("/usr/share/man",
	    OPENBSD ? "/usr/X11R6/man" : "/usr/X11R7/man") {
		for my $a ("", $arch) {
			for my $page ("man$section/$a/$page.$section") {
				return 1 if -f "$dir/$page";
			}
		}
	}
	return 1 if -f "./$page.$section";

	$self->warning($pre."trailing Xref to $page($section)$post") if $opt_x;
	return 0;
}

sub new
{
	my ($class, $fn) = @_;

	my $o = {
		mandoc_p => 1,
		all => [],
		lastline => '',
		changes => 0,
		oxrcsidseen => 0,
		nxrcsidseen => 0,
		lastsh => 0,
		sasection => 0,
		saname => '',
		sarest => ',',
		inauthors => 0,
		in_section => 0,
		inliteral => 0,
		shseen => {},
		last_error_name => '',
		current_section_header => '',
		fn => $fn
	};
	open my $input, '<', $fn or die "can't open input file $fn";
	$o->{file} = $input;
	$o->{ln} = 0;
	bless $o, $class;
}

sub next_line
{
	my ($self) = @_;

	my $l = readline($self->{file});
	if (defined $l) {
		$self->{ln}++;
	}
	return $l;
}

sub close
{
	my ($self) = @_;

	close($self->{file});
}

sub parse_macro_args
{
	my ($s, $string) = @_;
	$_ = $string;
	my @params = ();
	while (!/^$/) {
		if (s/^\"(.*?)\"\s*//) {
			push(@params, $1);
		} elsif (s/^(\S+)\s*//) {
			push(@params, $1);
		}
	}
	if (@params > 9 and OPENBSD) {
		$s->warning("$string holds >9 parameters");
	}
	return @params;
}

sub end_of_section
{
	my ($s) = @_;

	if ($s->{in_section} == SECTION_SEE_ALSO and not $s->{sarest} eq "") {
		$s->warning("unneeded characters at end of SEE ALSO: ",
		    "`$s->{sarest}'") if $opt_a;
		# to avoid a second warning at EOF
		$s->{sarest} = "";
	}

	if ($s->{in_section} == SECTION_AUTHORS) {
	    if (!$s->{an_found}) {
		$s->warning("missing .An in AUTHORS section") if $opt_A;
	    }
	}
}

sub set_section_header
{
	my ($s, $section_header) = @_;
	$section_header = join(' ', $s->parse_macro_args($section_header));

	end_of_section($s);

	if ($section_header eq 'SEE ALSO') {
		$s->{in_section} = SECTION_SEE_ALSO;
	} elsif ($section_header eq 'AUTHORS') {
		$s->{in_section} = SECTION_AUTHORS;
	} else {
		$s->{in_section} = 0;
	}

	if (not $sections{$section_header}) {
		$s->warning("unknown section header: ",
		    "`$section_header'") if $opt_S;
	} else {
		if ($s->{lastsh} >= $sections{$section_header}) {
			$s->warning("section header ",
			    "`$section_header' in wrong order") if $opt_S;
		}
		$s->{shseen}->{$section_header} = 1;
		$s->{lastsh} = $sections{$section_header};
	}

	if ($s->{lastline} =~ /^\.Pp/o) {
		$s->warning("Paragraph problem: section header after .Pp")
		    if $opt_P;
	}

	$s->{current_section_header} = $section_header;
}

sub process_and_save_line
{
	my ($s, $input) = @_;
	my $result = $s->process_line($input);
	# note that process_line chomps \n, then re-adds it,
	# so we detect a change on last lines without a \n.
	if ($result ne "$input") {
		$s->{changes} = 1;
	}
	push(@{$s->{all}}, $result);
}

sub process_line
{
	my $s;
	($s, $_) = @_;
	chomp;
	# always cut trailing spaces
	if (/\s+$/o) {
		$s->warning("trailing space: `$_'") if $opt_s;
		s/\s+$//o;
	}
	if (/\$OpenBSD\b.*\$/o) {
		$s->{oxrcsidseen}++;
		if (OPENBSD and ($s->{oxrcsidseen} > 1)) {
		    $s->warning("RCS Id seen twice") if $opt_r;
		}
		return "$_\n";
	}
	if (/[\$]NetBSD\b.*\$/o) {
		$s->{nxrcsidseen}++;
		if (NETBSD and ($s->{nxrcsidseen} > 1)) {
		    $s->warning("RCS Id seen twice") if $opt_r;
		}
		return "$_\n";
	}
	# comments
	if (/^\.\\\"/) {
		return "$_\n";
	}
	if (/^\.TH\s+/o) {
		$s->warning("not mandoc") if $opt_m;
		$s->{mandoc_p} = 0;
#	    	/^.TH\s*[\w-_".]+\s*([1-9])/;
#	    	$section = $1;
		return "$_\n";
	}
#	if (/^.Dt\s*[\w-_".]+\s*([1-9])/) {
#	    	$section = $1;
#	}
	if (/^\.Dt\s+/o) {
	    if (! /^\.Dt\s+(?:[A-Z\d._-]+)\s+$sections_re(?:\s+$arches_re)?$/o)  {
		    $s->warning("bad .Dt: `$_'") if $opt_D;
	    }
	}

	if ($s->{mandoc_p}) {
		if (/^\.Sh\s+(.*)$/o) {
			my $line = $_;
			$s->set_section_header($1);
			return "$line\n";
		}
	} else {
		if (/^\.SH\s+(.*)$/o) {
			my $line = $_;
			$s->set_section_header($1);
			return "$line\n";
		}
	}

	if ($s->{in_section} == SECTION_SEE_ALSO) {
		if (/^\.Xr\s+(\S+)\s+($sections_re)\s?(.*)?$/o) {
			my ($saname, $sasection, $sarest) = ($1, $2, $3);
			$saname =~ s/^\\&//o;
			if ($s->{sasection} gt $sasection
			    or ($s->{sasection} eq $sasection and
			    (lc($s->{saname}) gt lc($saname)))) {
				$s->warning("SEE ALSO: `.Xr $s->{saname} ",
				    "$s->{sasection}' should be after ",
				    "`.Xr $saname $sasection'") if $opt_a;
			}
			if ($s->{sarest} ne ",") {
				$s->warning("SEE ALSO: .Xr not separated ",
				    "by comma, but `$s->{sarest}'") if $opt_a;
			}
			$s->{saname} = $saname;
			$s->{sasection} = $sasection;
			$s->{sarest} = $sarest;
		}
		if (/^\.Rs(?:\s+|$)/o) {
			if ($s->{sarest} ne "") {
				$s->warning("SEE ALSO: Not necessary to ",
				    "separate .Xr from .Rs by ",
				    "`$s->{sarest}'") if $opt_a;
			}
			$s->{sarest} = "";
		}
	}
	if ($s->{in_section} == SECTION_AUTHORS) {
		if (/^\.An / && not /^\.An -(no|)split/) {
			$s->{an_found} = 1;
		}
	}

	if (/^\.Fn.*,.+/o) {
		$s->warning("possible .Fn misuse: `$_'") if $opt_f;
	}
	if (OPENBSD) {
		if (/^(?:[<>])/o or /[^\\][<>]/o) {
			$s->warning("use \*(Lt \*(Gt (or .Aq) ",
			    "instead of < >: `$_'") if $opt_H;
		}
	}
	if (NETBSD) {
		if (/^(?:[<>&])/o or /[^\\][<>&]/o) {
			$s->warning("use \*[Lt] \*[Gt] (or .Aq) \*[Am] ",
			    "instead of < > &: `$_'") if $opt_H;
		}
	}

	if (/\b(Free|Net|DragonFly|Open)BSD\b/o
	    and not /\b(?:www|ftp)\.(?:Free|Net|DragonFly|Open)BSD\.org\b/o
	    and not /\bOpenBSD\::.*3p\b/o
	    and not /\/pub\/OpenBSD\//o
	    and not /\@(?:Free|Net|Open)BSD\.(?i:org)\b/o) {
		$s->warning("verbose mention of `$1BSD' instead of "
		    . "`$short{$1}': `$_'") if $opt_X;
	}
	if (/^\./o and (/Bx (Open)/o or /Bx (Free)/o
			or /Bx (DragonFly)/o or /Bx (Net)/o)) {
		$s->warning("`.Bx $1' found -- use $short{$1} instead")
		    if $opt_X;
	}
	if (/^\.Lb\s+(\S+)/o) {
		if (not $libraries{$1}) {
			$s->warning("Unknown library `$1' used as Lb argument") if $opt_l;
		}
	}
	if (/^\.Os\s+(.+)/o) {
		$s->warning(".Os used with argument `$1'") if $opt_o;
	}

	if (/^\.Nd.*\.$/o) {
		$s->warning(".Nd ends with a dot: `$_'") if $opt_n;
	}

	if (/(\w\w)\.\s+[A-Z]/o and not /^.%T/ and not $s->{inliteral}) {
		if ("$1" ne "St") {
			$s->warning("new sentence, new line: `$_'") if $opt_p;
		}
	}
	if (/^\... .*[^\s][\.();,\[\]\{\}:]$/o
	    and not /\s\.\.\.$/o and not /\\&.$/o) {
		$s->warning("punctuation in format string ",
		    "without space: `$_'") if $opt_p;
	}
	if (/^\./o and /Ns [\.();,\[\]\{\}:]/o) {
		$s->warning("possible Ns abuse: `$_'") if $opt_p;
	}
	if ((/^([^\.]\w+)\(\)/o or /^[^\.].*\s(\w+)\(\)/o) and not $s->{inliteral}) {
		$s->warning("use .Fn or .Xr for functions: `$1()'") if $opt_p;
	}

	my $destruct = $_;
	if ($s->{mandoc_p}) {
		$destruct =~ s/\\\&([\w\.])/$1/o;
		if ($destruct =~ /^\.Xr\s+([\w\:\.\-\+\/]+)\s+($esections_re)(.*)/o) {
			$s->debug("Xref to $1($2) found: `$_'");
			$s->verify_xref($1, $2, "", "");
			if ($3 =~ /^\S/o) {
				$s->warning("No space after section number in Xref: `$_'") if $opt_x;
			}
		} elsif ($destruct =~ /^\.Xr/o) {
			$s->warning("Weird Xref found: `$_'") if $opt_x;
		}
	} else {
		$destruct =~ s/\\f.//go;
		if ($destruct !~ /^\.\\\"/o) {
			while ($destruct =~ s/([-\w.]+)\s*\(($esections_re)\)//o) {
				$s->debug("possible Xref to $1($2) found: `$_'");
				$s->verify_xref($1, $2, "possible ", ": `$_'");
				# so that we have a chance to find more than one
				# per line
				$destruct =~ s/(\w+)\s*\(($sections_re)\)//o;
			}
		}
	}

	if (/^\.Dd/o and not /^\.Dd\s+$valid_date_re/o) {
		$s->warning("Invalid date found: `$_'") if $opt_d;
	}

	if (/^\.Bd\b.*-literal\b/o) {
		$s->{inliteral} = 1;
	}
	if ($s->{inliteral} == 1) {
		if (/^\.Ed\b/o) {
		    $s->{inliteral} = 0;
		}
	} elsif (/^$/o) {
		$s->warning("Paragraph problem: empty line -- ",
		    "use .Pp for paragraphs") if $opt_P;
	}
	if ($s->{lastline} =~ /^\.Pp/o and /^(\.Ss|\.Pp)/o) {
		$s->warning("Paragraph problem: $1 after .Pp") if $opt_P;
	}
	if (/^\.Pp/o and $s->{lastline} =~ /^(\.S[Ssh])/o) {
		$s->warning("Paragraph problem: .Pp after $1") if $opt_P;
	}

	# Check whether the list of possible errors for a function is
	# sorted alphabetically.
	#
	# Error names should not be sorted across different lists.
	# (see bind(2) for an example.)
	#
	/^\.Bl\s+/o and $s->{last_error_name} = "";

	if ($s->{current_section_header} eq "ERRORS" and
	    /^\.It\s+(Bq\s+)?(Er\s+)?(E[\w_]+)$/o) {
		my $current_error_name = $3;

		if ($s->{last_error_name} eq $current_error_name) {
			$s->warning("Duplicate item for ",
			    $current_error_name, ".") if $opt_e;
		} elsif ($current_error_name lt $s->{last_error_name}) {
			$s->warning("$s->{last_error_name} and ",
			    "$current_error_name are not in ",
			    "alphabetical order.") if $opt_e;
		}
		$s->{last_error_name} = $current_error_name;
	}

	$s->{lastline} = $_;
	return "$_\n";
}

sub finish
{
	my ($s) = @_;

	if (NETBSD and not $s->{nxrcsidseen}) {
		$s->warning("Missing RCS Id") if $opt_r;
	}
	if (OPENBSD and not $s->{oxrcsidseen}) {
		$s->warning("Missing RCS Id") if $opt_r;
	}

	if ($s->{lastline} =~ /^\.Pp/o) {
		$s->warning("Paragraph problem: .Pp at EOF") if $opt_P;
	}

	end_of_section($s);

#    	if (not ($fn =~ /$section$/)) {
#		$s->warning("section doesn't match (internal value: $section)");
#    	}
	if ($s->{mandoc_p}) {
		foreach my $i (qw(NAME SYNOPSIS DESCRIPTION)) {
			if (not ($s->{shseen}{$i})) {
				$s->warning("missing $i section") if $opt_S;
			}
		}
	}
}

package main;

sub handle_file
{
	my $parser = Parser->new($_[0]);

	while ($_ = $parser->next_line) {
		$parser->process_and_save_line($_);
	}

	$parser->finish;
	$parser->close;
	if ($Parser::opt_F and $parser->{changes}) {
		open OUT, ">$_[0].new" or
		    die "can't open output file `$_[0].new'";
		for my $l (@{$parser->{all}}) {
			print OUT $l
		}
		close OUT;
		system("mv -i $_[0].new $_[0]");
	}
}

Parser->handle_options;
foreach my $file (@ARGV) {
	handle_file($file);
}
